## Introduction
The goal of this project is to predict the manner in which participants performed the exercise. This is the “classe” variable in the training set.
# Data Sets
The datasets are available at the links below:
For tarining- https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv
For test- https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv
I am going to use the following R packages for EDA, data cleaning and model building:
library(knitr)  # library() errors if the package is missing; require() only warns
## Loading required package: knitr
library(caret)  # model training/evaluation (nearZeroVar, confusionMatrix)
## Loading required package: caret
## Warning: package 'caret' was built under R version 4.0.4
## Loading required package: lattice
## Loading required package: ggplot2
library(rpart)  # decision-tree model
## Loading required package: rpart
library(rpart.plot)  # plotting for rpart trees
## Loading required package: rpart.plot
## Warning: package 'rpart.plot' was built under R version 4.0.4
library(randomForest)  # random-forest model
## Loading required package: randomForest
## Warning: package 'randomForest' was built under R version 4.0.5
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
library(caTools)  # sample.split for stratified train/test partitioning
## Loading required package: caTools
library(Amelia)  # missmap() for visualising missing values
## Loading required package: Amelia
## Warning: package 'Amelia' was built under R version 4.0.5
## Loading required package: Rcpp
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.6, built: 2019-11-24)
## ## Copyright (C) 2005-2021 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
library(devtools)
## Loading required package: devtools
## Loading required package: usethis
library(ggcorrplot)  # correlation heatmap
## Loading required package: ggcorrplot
## Warning: package 'ggcorrplot' was built under R version 4.0.5
library(plotly)  # interactive plots via ggplotly()
## Loading required package: plotly
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(correlationfunnel)  # binarize()/correlate() funnel plots
## Loading required package: correlationfunnel
## Warning: package 'correlationfunnel' was built under R version 4.0.5
## == correlationfunnel Tip #3 ====================================================
## Using `binarize()` with data containing many columns or many rows can increase dimensionality substantially.
## Try subsetting your data column-wise or row-wise to avoid creating too many columns.
## You can always make a big problem smaller by sampling. :)
## Loading Data
# Load the training and test CSVs. file.choose() opens an interactive file
# picker, so the script works regardless of where the CSVs were saved.
# stringsAsFactors = TRUE so text columns (including the outcome "classe")
# come in as factors, which the classifiers below expect.
tr_data <- read.csv(file.choose(), stringsAsFactors = TRUE, header = TRUE)
ts_data <- read.csv(file.choose(), stringsAsFactors = TRUE, header = TRUE)
missmap(tr_data) # visualise missing values in the training set
missmap(ts_data) # same missingness map for the test set
dim(tr_data)
## [1] 19622 160
# Confirm the test set dimensions (rows x columns)
dim(ts_data)
## [1] 20 160
## Data Cleansing
After checking the datasets, I found that there are missing values in both, and some variables have near-zero variance. We need to remove these uninformative columns before modelling.
# Identify (near-)zero-variance predictors on the TRAINING set only, then
# drop the same columns from both sets so they stay aligned.
zero_var_cols <- nearZeroVar(tr_data)
tr_data <- tr_data[, -zero_var_cols]
ts_data <- ts_data[, -zero_var_cols]
dim(tr_data)
## [1] 19622 100
# Test set should now have the same column count as the training set
dim(ts_data)
## [1] 20 100
# Flag columns that are more than 95% NA in the TRAINING data; these carry
# essentially no signal. colMeans(is.na(.)) gives the NA fraction per column.
AllNA <- colMeans(is.na(tr_data)) > 0.95
tr_data <- tr_data[, !AllNA]  # keep only columns below the NA threshold
dim(tr_data)
## [1] 19622 59
# Apply the SAME training-derived NA mask to the test set so both data sets
# keep identical columns (never re-derive the mask from the test set).
ts_data <- ts_data[, !AllNA]
dim(ts_data)
## [1] 20 59
# Drop the leading identifier/metadata columns (row index, user name,
# timestamps, window counters), keeping columns 8:59 — the sensor
# predictors plus the outcome "classe".
# NOTE(review): the printed colnames below start at "pitch_belt", which
# suggests column 7 may already be a sensor variable ("roll_belt") and is
# being discarded here — confirm whether 7:59 was intended.
tr_data <- tr_data[, 8:59]
ts_data <- ts_data[, 8:59]
dim(tr_data)
## [1] 19622 52
# Final test-set dimensions after all column filtering
dim(ts_data)
## [1] 20 52
# List the 51 retained sensor predictors plus the outcome "classe"
colnames(tr_data)
## [1] "pitch_belt" "yaw_belt" "total_accel_belt"
## [4] "gyros_belt_x" "gyros_belt_y" "gyros_belt_z"
## [7] "accel_belt_x" "accel_belt_y" "accel_belt_z"
## [10] "magnet_belt_x" "magnet_belt_y" "magnet_belt_z"
## [13] "roll_arm" "pitch_arm" "yaw_arm"
## [16] "total_accel_arm" "gyros_arm_x" "gyros_arm_y"
## [19] "gyros_arm_z" "accel_arm_x" "accel_arm_y"
## [22] "accel_arm_z" "magnet_arm_x" "magnet_arm_y"
## [25] "magnet_arm_z" "roll_dumbbell" "pitch_dumbbell"
## [28] "yaw_dumbbell" "total_accel_dumbbell" "gyros_dumbbell_x"
## [31] "gyros_dumbbell_y" "gyros_dumbbell_z" "accel_dumbbell_x"
## [34] "accel_dumbbell_y" "accel_dumbbell_z" "magnet_dumbbell_x"
## [37] "magnet_dumbbell_y" "magnet_dumbbell_z" "roll_forearm"
## [40] "pitch_forearm" "yaw_forearm" "total_accel_forearm"
## [43] "gyros_forearm_x" "gyros_forearm_y" "gyros_forearm_z"
## [46] "accel_forearm_x" "accel_forearm_y" "accel_forearm_z"
## [49] "magnet_forearm_x" "magnet_forearm_y" "magnet_forearm_z"
## [52] "classe"
To check the correlation among the predictors:
# Correlation matrix of the numeric predictors (column 52, "classe", is the
# factor outcome and must be excluded), rounded to one decimal for display,
# then rendered as an interactive lower-triangle heatmap.
corr_mat <- round(cor(tr_data[, -52]), 1)
heatmap_gg <- ggcorrplot(
  corr_mat,
  hc.order = TRUE,
  type = "lower",
  outline.col = "white",
  ggtheme = ggplot2::theme_gray,
  colors = c("#6D9EC1", "white", "#E46726")
)
ggplotly(heatmap_gg)
## Correlation with classe__A
# Binarize predictors into 4 bins and rank them by correlation with class A.
tr_data %>%
  binarize(n_bins = 4, thresh_infreq = 0.01) %>%
  correlate(target = classe__A) %>%
  plot_correlation_funnel(interactive = TRUE, limits = c(-0.5, 0.5))
## Correlation with classe__B
# Binarize predictors into 4 bins and rank them by correlation with class B.
tr_data %>%
  binarize(n_bins = 4, thresh_infreq = 0.01) %>%
  correlate(target = classe__B) %>%
  plot_correlation_funnel(interactive = TRUE, limits = c(-0.5, 0.5))
## Correlation with classe__C
# Binarize predictors into 4 bins and rank them by correlation with class C.
tr_data %>%
  binarize(n_bins = 4, thresh_infreq = 0.01) %>%
  correlate(target = classe__C) %>%
  plot_correlation_funnel(interactive = TRUE, limits = c(-0.5, 0.5))
## Correlation with classe__D
# Binarize predictors into 4 bins and rank them by correlation with class D.
tr_data %>%
  binarize(n_bins = 4, thresh_infreq = 0.01) %>%
  correlate(target = classe__D) %>%
  plot_correlation_funnel(interactive = TRUE, limits = c(-0.5, 0.5))
## Correlation with classe__E
# Binarize predictors into 4 bins and rank them by correlation with class E.
tr_data %>%
  binarize(n_bins = 4, thresh_infreq = 0.01) %>%
  correlate(target = classe__E) %>%
  plot_correlation_funnel(interactive = TRUE, limits = c(-0.5, 0.5))
## Data Partitioning
For cross-validation — that is, to estimate the accuracy of our model on held-out data — we need to split the training dataset further. For this I am going to use the caTools package.
# Hold out ~25% of the labelled data to estimate out-of-sample accuracy.
# Two fixes versus the original:
#  1. set.seed() must come BEFORE the random split, otherwise the
#     partition is not reproducible.
#  2. caTools::sample.split expects the OUTCOME VECTOR (Y), not the whole
#     data frame — passing the data frame produces a per-column mask that
#     gets recycled over rows instead of a stratified row split.
set.seed(123)
split_mask <- sample.split(tr_data$classe, SplitRatio = 0.75)
train_dt <- subset(tr_data, split_mask == TRUE)
test_dt <- subset(tr_data, split_mask == FALSE)

# Baseline model: a single classification tree, evaluated on the holdout.
fit <- rpart(classe ~ ., data = train_dt, method = "class")
pred <- predict(fit, test_dt, type = "class")
confusionMatrix(pred, test_dt$classe)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1217 136 7 31 34
## B 48 516 39 62 120
## C 29 77 622 111 130
## D 77 171 130 559 114
## E 23 50 57 42 503
##
## Overall Statistics
##
## Accuracy : 0.6966
## 95% CI : (0.6836, 0.7095)
## No Information Rate : 0.2842
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6168
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.8730 0.5432 0.7275 0.6944 0.5583
## Specificity 0.9408 0.9320 0.9143 0.8800 0.9570
## Pos Pred Value 0.8540 0.6573 0.6419 0.5319 0.7452
## Neg Pred Value 0.9491 0.8947 0.9408 0.9362 0.9059
## Prevalence 0.2842 0.1937 0.1743 0.1641 0.1837
## Detection Rate 0.2481 0.1052 0.1268 0.1140 0.1025
## Detection Prevalence 0.2905 0.1600 0.1976 0.2143 0.1376
## Balanced Accuracy 0.9069 0.7376 0.8209 0.7872 0.7577
As we can see, the accuracy level of the decision tree is about 70%, which is not up to the desired level, so we need to try other models and compare their accuracy with this one.
##Random Forest Model
# Random forest on the same train/holdout partition. The original passed
# "mtyr = 7" — a typo for "mtry" — which randomForest silently swallows via
# its ... argument, so the intended tuning (7 variables tried per split)
# never took effect; corrected here.
set.seed(123)
rf_fit <- randomForest(classe ~ ., data = train_dt, mtry = 7)
rf_pred <- predict(rf_fit, test_dt)
cnf_rf <- confusionMatrix(rf_pred, test_dt$classe)
# Mosaic plot of the confusion matrix with the accuracy in the title.
plot(cnf_rf$table, col = cnf_rf$byClass, color = "blue",
     main = paste("Random Forest-Accuracy=",
                  round(cnf_rf$overall['Accuracy'], 4)))
## Warning: In mosaicplot.default(x, xlab = xlab, ylab = ylab, ...) :
## extra argument 'col' will be disregarded
After checking the overall statistics, the Random Forest model clearly has higher accuracy than the decision tree model, so we select the Random Forest model for the final prediction.
# Apply the selected random-forest model to the 20 held-out test cases
# and print the predicted "classe" for each.
rf_ts_pred <- predict(rf_fit,ts_data)
rf_ts_pred
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## B A B A A E D B A A B C B A E E A B B B
## Levels: A B C D E